rm(list=ls(all=TRUE)) # clear workspace
graphics.off() # closes all graphics

Thanks due to https://huntsmancancerinstitute.github.io/hciR/pasilla_DESeq.html

This guide follows the Bioconductor RNA-Seq workflow to find differentially expressed genes using DESeq2. Load the hciR package to run the R code.

Load samples and counts

Load the counts and samples using the readr package.

packages(readr)

packages(microbiome)
samples <- read_tsv("C:/Users/Ryan/Google Drive/UC Davis/Publications/MFC Microbiome Paper - Experiment 15/metadata_frd_exp15.txt")
counts  <- read_tsv("metagenome.tab", quote = "")
#  # A tibble: 6 x 19
#    OTU_ID `12-MS515F` `1-MS515F` `13-MS515F` `10-MS515F` `2-MS515F` `3-MS515F` `4-MS515F` `5-MS515F` `6-MS515F`
#     <chr>       <int>      <int>       <int>       <int>      <int>      <int>      <int>      <int>      <int>
#  1 K01365           0          0           0           0          0          0          0          0          0
#  2 K01364           0          0           0           0          0          0          0          0          0
#  3 K01361           4         11           2           6          0         14          0          7          6
#  4 K01360           0          0           0           0          0          0          0          0          0
#  5 K01362       18555      19442       18887       18768      11216      18297      23687      17577      20485
#  6 K02249           0          0           0           2          0          1          0          0          0
#  # ... with 9 more variables: `7-MS515F` <int>, `9-MS515F` <int>, `11-MS515F` <int>, `14-MS515F` <int>,
#  #   `8-MS515F` <int>, `17-MS515F` <int>, `15-MS515F` <int>, `18-MS515F` <int>, `16-MS515F` <int>

Remove features with zero counts and features with one or fewer reads in any sample.

packages(dplyr)
counts <- filter_counts(counts)
#  Removed 1254 features with 0 reads
#  Removed 127 features with <=1 maximum reads
head(counts)
#  # A tibble: 6 x 19
#    OTU_ID `12-MS515F` `1-MS515F` `13-MS515F` `10-MS515F` `2-MS515F` `3-MS515F` `4-MS515F` `5-MS515F` `6-MS515F`
#     <chr>       <int>      <int>       <int>       <int>      <int>      <int>      <int>      <int>      <int>
#  1 K01361           4         11           2           6          0         14          0          7          6
#  2 K01362       18555      19442       18887       18768      11216      18297      23687      17577      20485
#  3 K02249           0          0           0           2          0          1          0          0          0
#  4 K05841           1          0           0           0          0          0         10          0          0
#  5 K05844        3836        272        2208        1166       1544        781        158        844       1119
#  6 K05845        4415      16593        9487       11056       7075      12118      17639      12320      13951
#  # ... with 9 more variables: `7-MS515F` <int>, `9-MS515F` <int>, `11-MS515F` <int>, `14-MS515F` <int>,
#  #   `8-MS515F` <int>, `17-MS515F` <int>, `15-MS515F` <int>, `18-MS515F` <int>, `16-MS515F` <int>

Filter data for classes of interest

which.cols <- unlist(samples[which(samples$Sample == "Bristles"|samples$Sample == "Cathode"),"Label"])
counts <- as_tibble(cbind(counts$OTU_ID,counts[,which(colnames(counts) %in% which.cols)]))
names(counts)[names(counts) == 'counts$OTU_ID'] <- 'OTU_ID'
counts$OTU_ID <- as.character(counts$OTU_ID)
head(counts)
#  # A tibble: 6 x 9
#    OTU_ID `12-MS515F` `13-MS515F` `11-MS515F` `14-MS515F` `17-MS515F` `15-MS515F` `18-MS515F` `16-MS515F`
#     <chr>       <int>       <int>       <int>       <int>       <int>       <int>       <int>       <int>
#  1 K01361           4           2           5           5          11          13          40          15
#  2 K01362       18555       18887       25754       21057       27253       22671       32389       31287
#  3 K02249           0           0           0           0           2           2           2           1
#  4 K05841           1           0           0           0           3          12           9          68
#  5 K05844        3836        2208        6645        4544        9622       14822       14065       14567
#  6 K05845        4415        9487        4423        3739       10177        2021        3788        5100

samples <- samples[which(samples$Sample == "Bristles"|samples$Sample == "Cathode"),]
samples
#  # A tibble: 8 x 6
#        Label                      Name   Sample Temperature Substrate  Aeration
#        <chr>                     <chr>    <chr>       <chr>     <chr>     <chr>
#  1 11-MS515F MFC 1 EXP 15 BRISTLES FRD Bristles         30C    Sludge Anaerobic
#  2 12-MS515F MCF 2 EXP 15 BRISTLES FRD Bristles         30C    Sludge Anaerobic
#  3 13-MS515F MFC 3 EXP 15 BRISTLES FRD Bristles         30C    Sludge Anaerobic
#  4 14-MS515F MFR 4 EXP 15 BRISTLES FRD Bristles         30C    Sludge Anaerobic
#  5 15-MS515F  MFC 1 EXP 15 CATHODE FRD  Cathode         30C    Sludge Anaerobic
#  6 16-MS515F  MFC 2 EXP 15 CATHODE FRD  Cathode         30C    Sludge Anaerobic
#  7 17-MS515F  MFC 3 EXP 15 CATHODE FRD  Cathode         30C    Sludge Anaerobic
#  8 18-MS515F  MFC 4 EXP 15 CATHODE FRD  Cathode         30C    Sludge Anaerobic

Run DESeq

Combine the counts and samples to create a DESeqDataSet object and calculate the regularized log transform (rlog) for sample visualizations.

packages(lazyeval)
packages(DESeq2)

names(samples)[names(samples) == 'Sample'] <- 'condition'

dds <- deseq_from_tibble(counts, samples,  design = ~ condition)
rld <- r_log(dds)

PCA plot

Plot the first two principal components using the rlog values from the top 500 variable genes. You can hover over points to view sample names or zoom into groups of points in this interactive highchart.

plot_pca(rld, "condition", tooltip=c("Label") , width=700)

Cluster all the rlog values using the R function dist to calculate the Euclidean distance between samples.

plot_dist(rld , c("condition"), palette="Reds", diagNA=FALSE, reverse_pal=FALSE)

Gene annotations

Load/create annotations.

fly <- as_tibble(gene_name_lookup)
colnames(fly) <- c("id","gene_name")
fly <- fly[order(fly$id),]
#add some NA columns
namevector <- c("biotype", "chromosome", "description")
fly[ , namevector] <- NA

head(fly)
#  # A tibble: 6 x 5
#        id                                                                        gene_name biotype chromosome
#     <chr>                                                                            <chr>   <lgl>      <lgl>
#  1 K00001                                               alcohol dehydrogenase [EC:1.1.1.1]      NA         NA
#  2 K00002                                       alcohol dehydrogenase (NADP+) [EC:1.1.1.2]      NA         NA
#  3 K00003                                            homoserine dehydrogenase [EC:1.1.1.3]      NA         NA
#  4 K00004 "\"(R,R)-butanediol dehydrogenase / diacetyl reductase [EC:1.1.1.4 1.1.1.303]\""      NA         NA
#  5 K00005                                              glycerol dehydrogenase [EC:1.1.1.6]      NA         NA
#  6 K00007                                       D-arabinitol 4-dehydrogenase [EC:1.1.1.11]      NA         NA
#  # ... with 1 more variables: description <lgl>

Result tables

Get the annotated DESeq results using a 5% false discovery rate (FDR). The default is to compare all treatment combinations, which only includes one in this study.

Since we downloaded the most recent fly annotations, there are 723 missing (note you could set the version in read_biomart to match the old count table)

res <- results_all(dds, fly, alpha= 0.05)
#  Using adjusted p-value < 0.05
#  1. Bristles vs. Cathode: 1595 up and 1298 down regulated
#  Note: 5528 rows in results are missing from biomart table

Browse results

Create a flex dashboard using the top 2000 genes sorted by p-value in pasilla_flex.html. The MA-plot and volcano plot are linked, so you can click and drag to create a box to highlight points and then view matching rows in the table. You can also drag the box around the plot to easily highlight other points and rows. In addition, you can search for genes in the table using the search box, but need to click on the row in order to highlight the points in the plots. The sliders can also be used to limit the results.

# packages(rmarkdown)
# packages(flexdashboard)
# packages(crosstalk)
# packages(d3scatter)
# rmd <- system.file("Rmd", "DESeq_flex.Rmd", package="hciR")
# render(rmd, output_file="pasilla_flex.html",  output_dir=".",
#          params=list( results= res, title = "Treatments", top= 2000 ))

Save results

Save the DESeq results to a single Excel file in pasilla_DESeq.xlsx. The write_deseq function will also output raw counts, rlog values, normalized counts, samples and fly annotations.

# write_deseq(res, dds, rld, fly, file="pasilla_DESeq.xlsx")

Plots

Plot the fold changes and p-values in an interactive volcano plot.

plot_volcano(res, padj = 1e-10, log2Fold =1 , width=700)
#  Adding mouseover labels to 3256 genes (62.4%)


Select the top 40 genes sorted by p-value and cluster the rlog differences, so values in the heatmap represent the amount a gene deviates in a specific sample from the gene’s average across all samples.


x <- top_counts( res, rld, top=40)
x
#  # A tibble: 40 x 9
#                                                                           id `11-MS515F` `12-MS515F` `13-MS515F`
#                                                                        <chr>       <dbl>       <dbl>       <dbl>
#   1                                       hydroxylamine oxidase [EC:1.7.3.4]    12.04587    11.36003    11.26504
#   2                       "\"1,3-propanediol dehydrogenase [EC:1.1.1.202]\""    12.10677    11.45461    11.32907
#   3                            nucleoside-triphosphatase THEP1 [EC:3.6.1.15]    11.08803    10.40032    10.34029
#   4                                     prephenate dehydratase [EC:4.2.1.51]    11.04784    10.34394    10.27744
#   5                                       thiosulfate reductase [EC:1.-.-.-]    11.11512    10.42496    10.36069
#   6                                                                     None    11.08552    10.41803    10.31882
#   7                glycerol-3-phosphate dehydrogenase subunit C [EC:1.1.5.3]    11.07905    10.41837    10.32025
#   8 "\"LysR family transcriptional regulator, positive regulator for ilvC\""    11.03326    10.32951    10.25655
#   9                                 nitrogenase molybdenum-iron protein NifN    11.21569    10.60965    10.46374
#  10                                                     putative transposase    11.26020    10.70259    10.48936
#  # ... with 30 more rows, and 5 more variables: `14-MS515F` <dbl>, `15-MS515F` <dbl>, `16-MS515F` <dbl>,
#  #   `17-MS515F` <dbl>, `18-MS515F` <dbl>
plot_genes(x, c("condition") )


Plot the top 400 genes using an interactive d3heatmap. Click and drag over a region in the plot to zoom and better view gene labels.

plot_genes( top_counts( res, rld, top=400), output="d3", xaxis_font_size=12, show_grid=FALSE)